stuff
i 76 We .
# Read the Chicago boundary polygon and project it to Illinois State Plane
# East (ESRI:102271) so all later layers share one planar CRS.
# Fixed: pass path components to file.path() separately instead of embedding
# a leading "/" in the second argument (which produced a double separator).
chicagoBoundary <-
  st_read(file.path(root.dir, "Chapter5", "chicagoBoundary.geojson")) %>%
  st_transform('ESRI:102271')
# Create a fishnet containing 500 by 500 foot squares over the boundary of Chicago.
# NOTE(review): the label "foot" holds only if the CRS is in feet; ESRI:102271
# (NAD 1983 StatePlane Illinois East) uses meters -- confirm cellsize units.
fishnet <-
st_make_grid(chicagoBoundary,
cellsize = 500,
square = TRUE) %>%
.[chicagoBoundary] %>% # <- MDH Added; sf subsetting keeps only cells intersecting the boundary
st_sf() %>%
mutate(uniqueID = rownames(.)) # character cell IDs, used as the join key throughout
## Chicago neighborhood polygons (blackmad/neighborhoods GeoJSON),
## reprojected to match the fishnet's CRS.
neighborhoods <-
  st_transform(
    st_read("https://raw.githubusercontent.com/blackmad/neighborhoods/master/chicago.geojson"),
    st_crs(fishnet))
# Download Chicago food-inspection records from the Socrata open-data API,
# keep 2018 rows with coordinates, and project the points to the fishnet CRS.
FoodInspect <-
read.socrata("https://data.cityofchicago.org/Health-Human-Services/Food-Inspections/4ijn-s7e5") %>%
mutate(year = substr(inspection_date,1,4)) %>% filter(year == "2018") %>%
# filter(results=="Fail") %>%  # NOTE(review): the "Fail"-only filter is disabled, so ALL inspections are counted -- confirm intent
dplyr::select(Y = latitude, X = longitude) %>%
na.omit() %>%
st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
st_transform(st_crs(fishnet)) %>%
mutate(Legend = "Food_Inspect")
Stuff
# Uses grid.arrange to organize independent plots: left, the raw 2018
# inspection points; right, a 2D kernel-density surface of the same points.
# Fixed: title year now matches the 2018 filter applied to FoodInspect above.
grid.arrange(ncol=2,
  ggplot() +
    geom_sf(data = chicagoBoundary) +
    geom_sf(data = FoodInspect, colour="red", size=0.1, show.legend = "point") +
    labs(title= "Food Inspections, Chicago - 2018") +
    mapTheme(title_size = 14),
  ggplot() +
    geom_sf(data = chicagoBoundary, fill = "grey80") +
    stat_density2d(data = data.frame(st_coordinates(FoodInspect)),
                   # NOTE(review): ..level.. is deprecated in ggplot2 >= 3.4
                   # (after_stat(level)); kept for compatibility with the
                   # project's pinned ggplot2 -- confirm version before changing.
                   aes(X, Y, fill = ..level.., alpha = ..level..),
                   size = 0.01, bins = 40, geom = 'polygon') +
    scale_fill_viridis() +
    # NOTE(review): guide = FALSE is deprecated (use guide = "none") in newer
    # ggplot2; kept for the same compatibility reason.
    scale_alpha(range = c(0.00, 0.35), guide = FALSE) +
    labs(title = "Density of Food Inspections") +
    mapTheme(title_size = 14) + theme(legend.position = "none"))
Inspections are then attributed to each fishnet cell.
## add a value of 1 to each inspection, sum them with aggregate
## (the original comment said "crime"; the points here are food inspections)
inspect_net <-
dplyr::select(FoodInspect) %>% # keep only the geometry column
mutate(countInspect = 1) %>%
aggregate(., fishnet, sum) %>% # sum point counts within each fishnet cell
mutate(countInspect = replace_na(countInspect, 0), # cells with no points get 0
uniqueID = rownames(.),
cvID = sample(round(nrow(fishnet) / 24), # ~24 random cross-validation fold IDs
size=nrow(fishnet), replace = TRUE))
# Choropleth of inspection counts per fishnet cell.
ggplot() +
geom_sf(data = inspect_net, aes(fill = countInspect), color = NA) +
scale_fill_viridis() +
labs(title = "Count of Inspections for the Fishnet") +
mapTheme()
Stuff
## Using Socrata again: 2018 rodent-baiting 311 requests as points in the
## fishnet CRS.
## Fixed: the dataset URL contained a stray space ("Historic al") that would
## break the request; the correct slug is "...-Rodent-Baiting-Historical".
rodentBait <-
  read.socrata("https://data.cityofchicago.org/Service-Requests/311-Service-Requests-Rodent-Baiting-Historical/97t6-zrhs") %>%
  mutate(year = substr(creation_date,1,4)) %>% filter(year == "2018") %>%
  dplyr::select(Y = latitude, X = longitude) %>%
  na.omit() %>%
  st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
  st_transform(st_crs(fishnet)) %>%
  mutate(Legend = "Rodent_Bait")
## 2018 sanitation-code 311 complaints, limited to a fixed set of violation
## categories and converted to points in the fishnet CRS.
sanitation311 <-
  read.socrata("https://data.cityofchicago.org/Service-Requests/311-Service-Requests-Sanitation-Code-Complaints-Hi/me59-5fac") %>%
  mutate(year = substr(creation_date, 1, 4)) %>%
  filter(year == "2018",
         what_is_the_nature_of_this_code_violation_ %in%
           c("Garbage in Yard", "Garbage in Alley", "Dumpster not being emptied",
             "Overflowing carts", "Construction Site Cleanliness/Fence",
             "Standing water")) %>%
  dplyr::select(Y = latitude, X = longitude) %>%
  na.omit() %>%
  st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
  st_transform(st_crs(fishnet)) %>%
  mutate(Legend = "Sanitation_311")
## 2018 building ordinance violations as points in the fishnet CRS.
ordinanceViolation <-
  read.socrata("https://data.cityofchicago.org/Administration-Finance/Ordinance-Violations-Buildings-/awqx-tuwv") %>%
  mutate(year = substr(VIOLATION.DATE,1,4)) %>% filter(year == "2018") %>%
  dplyr::select(Y = LATITUDE, X = LONGITUDE) %>%
  na.omit() %>%
  st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
  st_transform(st_crs(fishnet)) %>%
  # Fixed: removed the stray space in the legend label ("Ordinance _Violations"),
  # which would otherwise propagate into an awkward column name after spread().
  mutate(Legend = "Ordinance_Violations")
## Graffiti-removal 311 requests, restricted to building-surface locations.
## NOTE(review): filtered to 2017 while the other risk factors use 2018 --
## confirm whether the year difference is intentional.
graffiti <-
read.socrata(paste0("https://data.cityofchicago.org/Service",
"-Requests/311-Service-Requests-Graffiti-Removal-Historical/",
"hec5-y4x5")) %>%
mutate(year = substr(creation_date,1,4)) %>%
filter(year == "2017") %>%
filter(where_is_the_graffiti_located_ %in%
c("Front","Rear","Side")) %>%
dplyr::select(Y = latitude, X = longitude) %>%
na.omit() %>%
st_as_sf(coords = c("X","Y"), crs=4326, agr="constant") %>%
st_transform(st_crs(fishnet)) %>%
mutate(Legend = "Graffiti")
# streetLightsOut <-
# read.socrata(paste0("https://data.cityofchicago.org/Service",
# "-Requests/311-Service-Requests-Street-Lights-All-Out/",
# "zuxi-7xem")) %>%
# mutate(year = substr(creation_date,1,4)) %>%
# filter(year == "2017") %>%
# dplyr::select(Y = latitude, X = longitude) %>%
# na.omit() %>%
# st_as_sf(coords = c("X","Y"), crs=4326, agr="constant") %>%
# st_transform(st_crs(fishnet)) %>%
# mutate(Legend = "Street_Lights_Out")
## Packaged-liquor retail licenses as points in the fishnet CRS.
## Note: no year filter is applied here, unlike the other risk factors.
liquorRetail <-
read.socrata(paste0("https://data.cityofchicago.org/resource/",
"nrmj-3kcf.json")) %>%
filter(business_activity ==
"Retail Sales of Packaged Liquor") %>%
dplyr::select(Y = latitude, X = longitude) %>%
na.omit() %>%
st_as_sf(coords = c("X","Y"), crs=4326, agr="constant") %>%
st_transform(st_crs(fishnet)) %>%
mutate(Legend = "Liquor_Retail")
## 2017 food inspections used as a risk factor in the model.
## NOTE(review): despite the variable name, the "Fail" filter is commented
## out, so ALL 2017 inspections are included; the Legend also duplicates
## FoodInspect's "Food_Inspect" label -- confirm both are intentional.
foodInspectFail <-
read.socrata("https://data.cityofchicago.org/Health-Human-Services/Food-Inspections/4ijn-s7e5") %>%
mutate(year = substr(inspection_date,1,4)) %>% filter(year == "2017") %>%
# filter(results=="Fail") %>%
dplyr::select(Y = latitude, X = longitude) %>%
na.omit() %>%
st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
st_transform(st_crs(fishnet)) %>%
mutate(Legend = "Food_Inspect")
The multi-map below shows risk factors on a map.
## Combine all risk-factor points, count them per fishnet cell and Legend
## category, then spread the categories into one count column per factor.
vars_net <- rbind(rodentBait, sanitation311, ordinanceViolation, graffiti, liquorRetail, foodInspectFail) %>%
st_join(., fishnet, join=st_within) %>% # attach the containing cell's uniqueID to each point
st_drop_geometry() %>%
group_by(uniqueID, Legend) %>%
summarize(count = n()) %>%
full_join(fishnet, by = "uniqueID") %>% # full join keeps cells with zero points
spread(Legend, count, fill=0) %>% # superseded; pivot_wider() is the modern equivalent
st_sf() %>%
dplyr::select(-`<NA>`) %>% # drop the column produced by cells with no matched Legend
na.omit() %>%
ungroup()
## Long format for small-multiple mapping: one row per cell per risk factor.
vars_net.long <-
gather(vars_net, Variable, value, -geometry, -uniqueID) # superseded; pivot_longer() is the modern equivalent
vars <- unique(vars_net.long$Variable)
mapList1 <- list()
# Build one count map per risk factor, then arrange all of them in a grid.
for(i in vars){
mapList1[[i]] <-
ggplot() +
geom_sf(data = filter(vars_net.long, Variable == i),
aes(fill=value), colour=NA) +
scale_fill_viridis(name="") +
labs(title=i) +
mapTheme()}
do.call(grid.arrange,c(mapList1, ncol=3,
top="Risk Factors by Fishnet"))
The next chunk of code converts the risk factors to nearest-neighbor distances. This feature allows us to look at risk factors in a spatially “smoothed” manner. For all risk factors we pick k=3, meaning we take the average distance to the nearest 3 occurrences of the event. The nearest-neighbor plots follow:
# Convenience aliases to reduce the length of the function names used below.
st_c <- st_coordinates
st_coid <- st_centroid
## Average distance from each fishnet-cell centroid to its k = 3 nearest
## points of each risk factor (nn_function is a project helper).
## Refactored: the six copy-pasted mutate() blocks are collapsed into one
## loop, and the cell centroids are computed once instead of once per factor.
## Columns are added in the same order and with the same names as before.
cell_centroids <- st_c(st_coid(vars_net))
nn_point_sets <- list(
  rodent_Bait.nn         = rodentBait,
  sanitation_311.nn      = sanitation311,
  ordinance_Violation.nn = ordinanceViolation,
  graffiti.nn            = graffiti,
  liquor_Retail.nn       = liquorRetail,
  food_Inspect_Fail.nn   = foodInspectFail)
for (nn_name in names(nn_point_sets)) {
  vars_net[[nn_name]] <- nn_function(cell_centroids,
                                     st_c(nn_point_sets[[nn_name]]),
                                     k = 3)
}
## Long format of the .nn features and one small-multiple map per feature.
vars_net.long.nn <-
dplyr::select(vars_net, ends_with(".nn")) %>%
gather(Variable, value, -geometry) # superseded; pivot_longer() is the modern equivalent
vars <- unique(vars_net.long.nn$Variable)
mapList2 <- list()
for(i in vars){
mapList2[[i]] <-
ggplot() +
geom_sf(data = filter(vars_net.long.nn, Variable == i),
aes(fill=value), colour=NA) +
scale_fill_viridis(name="") +
labs(title=i) +
mapTheme()}
do.call(grid.arrange,c(mapList2, ncol = 3,
top = "Nearest Neighbor risk Factors by Fishnet"))
# Distance from each cell centroid to the Loop (downtown Chicago), used later
# as a model feature.
loopPoint <-
  filter(neighborhoods, name == "Loop") %>%
  st_centroid()
# Fixed: use `<-` rather than `=` for top-level assignment.
vars_net$loopDistance <-
  st_distance(st_centroid(vars_net), loopPoint) %>%
  as.numeric()
The plot below combines risk factors into a single map.
## Visualize the NN feature
## NOTE(review): this draws every .nn variable's cells in a single ggplot, so
## the factors overplot one another -- confirm a combined map is intended
## rather than facet_wrap(~Variable).
vars_net.long.nn <-
dplyr::select(vars_net, ends_with(".nn")) %>%
gather(Variable, value, -geometry)
ggplot() +
geom_sf(data = vars_net.long.nn, aes(fill=value), colour=NA) +
scale_fill_viridis(name="NN Distance") +
labs(title="Risk Factors NN Distance") +
mapTheme()
Since the counts were aggregated to each cell by uniqueID we can use that to join the counts to the fishnet.
## Join the risk-factor features onto the inspection-count fishnet by cell ID.
## vars_net's geometry is dropped first so only one geometry column remains.
final_net <- inspect_net %>%
  left_join(st_drop_geometry(vars_net), by = "uniqueID")
We use spatial joins to attach centroids of fishnets to polygon for neighborhoods and police districts. The map below shows the fishnet as police districts.
## Attach neighborhood names by spatially joining cell centroids to the
## neighborhood polygons, then restore the original cell geometries.
## NOTE(review): st_join() has no `by` argument -- "uniqueID" is absorbed by
## `...`; confirm it can simply be dropped.
final_net <-
st_centroid(final_net) %>%
st_join(dplyr::select(neighborhoods, name), by = "uniqueID") %>%
st_drop_geometry() %>%
left_join(dplyr::select(final_net, geometry, uniqueID)) %>% # joins by uniqueID; a "Joining, by" message is expected
st_sf() %>%
na.omit() # drops cells whose centroid fell outside every neighborhood
# for live demo
mapview::mapview(final_net, zcol = "name")
## generates warnings from PROJ issues
## {spdep}: build queen-contiguity neighbor lists for the fishnet polygons...
final_net.nb <- poly2nb(as_Spatial(final_net), queen=TRUE)
## ... and convert the neighbor list to row-standardized ("W") spatial weights;
## zero.policy permits cells with no neighbors.
final_net.weights <- nb2listw(final_net.nb, style="W", zero.policy=TRUE)
# print(final_net.weights, zero.policy=TRUE)
The grid map below shows four elements for each fishnet cell: the inspection count, the Local Moran's I statistic, its p-value, and an indicator for significant hotspots (p ≤ 0.001).
## see ?localmoran
# Local Moran's I for inspection counts under the queen weights.
local_morans <- localmoran(final_net$countInspect, final_net.weights, zero.policy=TRUE) %>%
as.data.frame()
# join local Moran's I results to fishnet
# (cbind relies on both objects deriving from final_net in the same row order)
final_net.localMorans <-
cbind(local_morans, as.data.frame(final_net)) %>%
st_sf() %>%
dplyr::select(countInspect = countInspect,
Local_Morans_I = Ii,
P_Value = `Pr(z != E(Ii))`) %>% # NOTE(review): this column name varies across spdep versions -- verify with names(local_morans)
mutate(Significant_Hotspots = ifelse(P_Value <= 0.001, 1, 0)) %>%
gather(Variable, Value, -geometry)
# varList1 <- list()
#
# for(i in vars){
# varList1[[i]] <-
# ggplot() +
# geom_sf(data = filter(final_net.localMorans, Variable==i),
# aes(fill = Value), colour=NA) +
# scale_fill_viridis(name="") +
# labs(title=i) +
# mapTheme() + theme(legend.position="bottom")}
#
# do.call(grid.arrange,c(varList1, ncol = 2,
# top = "Local Morans I statistics, Theft"))
## This is just for plotting: one map panel per Local Moran's statistic
## (count, I, p-value, significant hotspots).
vars <- unique(final_net.localMorans$Variable)
varList2 <- list()
for(i in vars){
varList2[[i]] <-
ggplot() +
geom_sf(data = filter(final_net.localMorans, Variable == i),
aes(fill = Value), colour=NA) +
scale_fill_viridis(name="") +
labs(title=i) +
mapTheme(title_size = 14) + theme(legend.position="bottom")}
do.call(grid.arrange,c(varList2, ncol = 4, top = "Local Morans I statistics, Inspect"))
Once again, we’ll use the nearest neighbor function to find the distance to a hot spot location.
# generates warning from NN
# Flag cells that are significant hotspots (p <= 0.001) and compute each
# cell's distance to the nearest hotspot.
# NOTE(review): local_morans[,5] assumes the p-value is the 5th column, which
# depends on the spdep version -- verify against names(local_morans).
# NOTE(review): inside this single pipeline, the inner filter(final_net, ...)
# refers to the PRE-pipeline global, which does not yet have Inspect.isSig;
# this likely only works if a previous run already added that column -- verify.
final_net <- final_net %>%
mutate(Inspect.isSig =
ifelse(local_morans[,5] <= 0.001, 1, 0)) %>%
mutate(Inspect.isSig.dist =
nn_function(st_c(st_coid(final_net)),
st_c(st_coid(filter(final_net,
Inspect.isSig == 1))),
k = 1))
# Map of each cell's distance to the nearest significant hotspot.
# Fixed: the title said "Theft" -- a leftover from the template -- but the
# outcome modeled here is food inspections.
ggplot() +
  geom_sf(data = final_net, aes(fill=Inspect.isSig.dist), colour=NA) +
  scale_fill_viridis(name="Inspect.isSig.dist") +
  labs(title="Distance to Highly Significant Inspection Hotspots") +
  mapTheme()
### Distance to the Loop
We’ll likewise consider the distance to the main business area and the center of Chicago–the Loop. See map below.
# Map of each cell's distance to the Loop.
ggplot() +
geom_sf(data = vars_net, aes(fill=loopDistance), colour=NA) +
scale_fill_viridis(name="loopDistance") +
labs(title="Distance to the Loop") +
mapTheme()
The plots below show correlations between the various risk factor features and food inspection counts in Chicago. The plot highlights that proximity to establishments with prior inspections and to retail liquor locations are potentially significant factors associated with inspection counts.
# Small-multiple scatterplots of inspection count against each risk-factor
# feature, annotated with the Pearson correlation per feature.
correlation.long <-
st_drop_geometry(final_net) %>%
dplyr::select(-uniqueID, -cvID, -name, -loopDistance, -Inspect.isSig, -Inspect.isSig.dist) %>%
gather(Variable, Value, -countInspect)
correlation.cor <-
correlation.long %>%
group_by(Variable) %>%
summarize(correlation = cor(Value, countInspect, use = "complete.obs"))
ggplot(correlation.long, aes(Value, countInspect)) +
geom_point(size = 0.1) +
geom_text(data = correlation.cor,
aes(label = paste("r =", round(correlation, 2))),
x=-Inf, y=Inf, vjust = 1.5, hjust = -.1) + # pin the label to the top-left of each facet
geom_smooth(method = "lm", se = FALSE, colour = "black") +
facet_wrap(~Variable, ncol = 2, scales = "free") +
labs(title = "Failed inspection count as a function of risk factors") + # NOTE(review): counts include ALL inspections since the Fail filter is disabled upstream
plotTheme()
The histogram below provides a view of the frequency of the dependent variables.
# Aggregate inspection counts by random cross-validation fold (cvID), then
# plot the distribution of fold sums.
# Fixed: the original comment said "by NEIGHBORHOOD", but the grouping is by
# the random fold id cvID.
HistogramInspect <-
  final_net %>%
  group_by(cvID) %>%
  summarize(countInspect_sum = sum(countInspect, na.rm = TRUE),
            countInspect_mean = mean(countInspect, na.rm = TRUE)) %>%
  ungroup()
# error_by_reg_and_fold %>%
#   arrange(desc(MAE))
# error_by_reg_and_fold %>%
#   arrange(MAE)
## Plot histogram of inspection counts per fold.
HistogramInspect %>%
  ggplot(aes(countInspect_sum)) +
  geom_histogram(bins = 30, colour="black", fill = "#FDE725FF") +
  scale_x_continuous(breaks = seq(30, 475, by = 25)) +
  # Fixed: x-axis label said "Theft Count" -- the counts are inspections.
  labs(title="Distribution of Failed Inspection Counts", subtitle = "Chicago, IL",
       x="Inspection Count", y="Count")
The model below uses all six risk factors as nearest-neighbor features, along with spatial features capturing where inspection counts are significantly clustered, as independent variables. The count of inspections is taken as the dependent variable. The model cycles through each neighborhood (or random fold) as a hold out, then trains on the remaining cells. Cross-validation is then used to generate predictions for the hold out.
We generate four variations of the model: risk factors with and without spatial features, k-fold and spatial cross-validation.
# Necessary workaround for the course helper's hardcoded "countBurglaries":
# redefine crossValidate locally. Guarded so a fresh session does not warn.
if (exists("crossValidate")) remove(crossValidate)

#' Leave-one-group-out cross-validation for a Poisson count model.
#'
#' For each unique value of the `id` column, fits a Poisson GLM on all other
#' groups and predicts the held-out group.
#'
#' @param dataset sf data frame containing a `geometry` column.
#' @param id String naming the fold-identifier column.
#' @param dependentVariable String naming the count outcome column.
#' @param indVariables Character vector of predictor column names.
#' @return An sf object of every held-out row with a `Prediction` column.
crossValidate <- function(dataset, id, dependentVariable, indVariables) {
  cvID_list <- unique(dataset[[id]])
  # Preallocate one slot per fold instead of growing with rbind() in the loop.
  fold_predictions <- vector("list", length(cvID_list))
  for (k in seq_along(cvID_list)) {
    thisFold <- cvID_list[[k]]
    # all_of() makes the character-based column selection explicit and avoids
    # tidyselect ambiguity warnings for the string/vector arguments.
    fold.train <- filter(dataset, dataset[[id]] != thisFold) %>%
      as.data.frame() %>%
      dplyr::select(all_of(id), geometry, all_of(indVariables),
                    all_of(dependentVariable))
    fold.test <- filter(dataset, dataset[[id]] == thisFold) %>%
      as.data.frame() %>%
      dplyr::select(all_of(id), geometry, all_of(indVariables),
                    all_of(dependentVariable))
    regression <-
      glm(paste0(dependentVariable, "~."), family = "poisson",
          data = fold.train %>%
            dplyr::select(-geometry, -all_of(id)))
    fold_predictions[[k]] <-
      mutate(fold.test, Prediction = predict(regression, fold.test, type = "response"))
  }
  return(st_sf(bind_rows(fold_predictions)))
}
# View(crossValidate)
## Independent variables: risk factors PLUS spatial-process features
## (hotspot flag, hotspot distance, distance to the Loop).
## (The original two comments were swapped relative to the definitions.)
reg.ss.vars <- c("rodent_Bait.nn", "sanitation_311.nn","ordinance_Violation.nn", "graffiti.nn", "liquor_Retail.nn", "food_Inspect_Fail.nn", "Inspect.isSig", "Inspect.isSig.dist", "loopDistance")
## Independent variables: risk factors only (no spatial-process features).
reg.vars <-
c("rodent_Bait.nn", "sanitation_311.nn","ordinance_Violation.nn", "graffiti.nn", "liquor_Retail.nn", "food_Inspect_Fail.nn")
## RUN REGRESSIONS: four variants crossing fold type (random cvID folds vs.
## leave-one-neighborhood-out by name) with feature set (risk factors only
## vs. plus spatial-process features). countInspect is renamed countFailures
## to satisfy the model helpers.
# Spatial LOGO-CV, risk factors plus spatial features.
reg.ss.spatialCV <- crossValidate(
dataset = dplyr::rename(final_net, countFailures = countInspect),
id = "name",
dependentVariable = "countFailures",
indVariables = reg.ss.vars) %>%
dplyr::select(cvID = name, countFailures, Prediction, geometry)
# Random k-fold CV, risk factors only.
reg.cv <- crossValidate(
dataset = dplyr::rename(final_net, countFailures = countInspect),
id = "cvID",
dependentVariable = "countFailures",
indVariables = reg.vars) %>%
dplyr::select(cvID = cvID, countFailures, Prediction, geometry)
# Random k-fold CV, risk factors plus spatial features.
reg.ss.cv <- crossValidate(
dataset = dplyr::rename(final_net, countFailures = countInspect),
id = "cvID",
dependentVariable = "countFailures",
indVariables = reg.ss.vars) %>%
dplyr::select(cvID = cvID, countFailures, Prediction, geometry)
# Spatial LOGO-CV, risk factors only.
reg.spatialCV <- crossValidate(
dataset = dplyr::rename(final_net, countFailures = countInspect),
id = "name",
dependentVariable = "countFailures",
indVariables = reg.vars) %>%
dplyr::select(cvID = name, countFailures, Prediction, geometry)
Model errors are plotted in a histogram below. The mode of the mean absolute errors is about 2 inspections. The model could benefit from further refinement given the presence of neighborhoods with significant errors. The most grievous errors occur in neighborhoods near the Loop and the touristy Millennium Park, presumably areas with a high density of food establishments.
Comparing mean errors between models shows that spatial features improve our models significantly, compared to using just the risk factors. The comparison map confirms this finding.
# Stack the four model outputs into one sf object, adding the per-cell
# prediction error and a Regression label used for faceting downstream.
reg.summary <-rbind(
mutate(reg.cv,
Error = Prediction - countFailures,
Regression = "Random k-fold CV: Just Risk Factors"),
mutate(reg.ss.cv,
Error = Prediction - countFailures,
Regression = "Random k-fold CV: Spatial Process"),
mutate(reg.spatialCV,
Error = Prediction - countFailures,
Regression = "Spatial LOGO-CV: Just Risk Factors"),
mutate(reg.ss.spatialCV,
Error = Prediction - countFailures,
Regression = "Spatial LOGO-CV: Spatial Process")) %>%
st_sf()
# calculate errors by NEIGHBORHOOD (cvID holds the neighborhood name for the
# LOGO models and the random fold id for the k-fold models)
# NOTE(review): Mean_Error is a scalar within each group when MAE is computed,
# so MAE and SD_MAE both reduce to abs(Mean_Error); SD_MAE is NOT a standard
# deviation here -- confirm intent (the per-model SD is computed later).
error_by_reg_and_fold <-
reg.summary %>%
group_by(Regression, cvID) %>%
summarize(Mean_Error = mean(Prediction - countFailures, na.rm = T),
MAE = mean(abs(Mean_Error), na.rm = T),
SD_MAE = mean(abs(Mean_Error), na.rm = T)) %>%
ungroup()
# Show the worst folds first (largest mean absolute error).
error_by_reg_and_fold %>%
arrange(desc(MAE))
## Simple feature collection with 396 features and 5 fields
## geometry type: GEOMETRY
## dimension: XY
## bbox: xmin: 341164.1 ymin: 552875.4 xmax: 367164.1 ymax: 594875.4
## projected CRS: NAD_1983_HARN_StatePlane_Illinois_East_FIPS_1201
## # A tibble: 396 x 6
## Regression cvID Mean_Error MAE SD_MAE geometry
## <chr> <chr> <dbl> <dbl> <dbl> <GEOMETRY [m]>
## 1 Spatial LOGO-… Loop -76.6 76.6 76.6 POLYGON ((358164.1 578375.4, 3…
## 2 Spatial LOGO-… Loop -51.5 51.5 51.5 POLYGON ((358164.1 578375.4, 3…
## 3 Spatial LOGO-… Stree… -46.4 46.4 46.4 MULTIPOLYGON (((359164.1 57987…
## 4 Spatial LOGO-… Boyst… -40.6 40.6 40.6 POLYGON ((356664.1 585875.4, 3…
## 5 Spatial LOGO-… Mille… -35.9 35.9 35.9 POLYGON ((359164.1 578875.4, 3…
## 6 Spatial LOGO-… Print… 32.9 32.9 32.9 POLYGON ((358664.1 577875.4, 3…
## 7 Spatial LOGO-… Wrigl… -32.9 32.9 32.9 POLYGON ((356164.1 586875.4, 3…
## 8 Spatial LOGO-… Rush … -32.6 32.6 32.6 POLYGON ((358664.1 581375.4, 3…
## 9 Spatial LOGO-… Greek… 30.4 30.4 30.4 POLYGON ((357164.1 578875.4, 3…
## 10 Spatial LOGO-… Stree… -26.9 26.9 26.9 MULTIPOLYGON (((359164.1 57987…
## # … with 386 more rows
# error_by_reg_and_fold %>%
# arrange(MAE)
## plot histogram of OOF (out of fold) errors, faceted by model variant
error_by_reg_and_fold %>%
ggplot(aes(MAE)) +
geom_histogram(bins = 30, colour="black", fill="#FDE725FF") +
facet_wrap(~Regression) +
geom_vline(xintercept = 0) + scale_x_continuous(
breaks = seq(0, 8, by = 1)) +
labs(title="Distribution of MAE",
subtitle = "k-fold cross-validation vs. LOGO-CV",
x="Mean Absolute Error", y="Count") +
plotTheme()
## Errors by Model: mean and SD of fold-level MAE per regression, rendered as
## a kable with the spatial-process rows (2 and 4) highlighted.
st_drop_geometry(error_by_reg_and_fold) %>%
group_by(Regression) %>%
summarize(Mean_MAE = round(mean(MAE), 2),
SD_MAE = round(sd(MAE), 2)) %>%
kable() %>%
kable_styling("striped", full_width = F) %>%
row_spec(2, color = "black", background = "#FDE725FF") %>%
row_spec(4, color = "black", background = "#FDE725FF")
| Regression | Mean_MAE | SD_MAE |
|---|---|---|
| Random k-fold CV: Just Risk Factors | 1.72 | 1.78 |
| Random k-fold CV: Spatial Process | 1.75 | 1.62 |
| Spatial LOGO-CV: Just Risk Factors | 5.56 | 11.36 |
| Spatial LOGO-CV: Spatial Process | 4.39 | 8.15 |
## Map of MAE by neighborhood for the two LOGO-CV models.
## Fixed: the title said "Theft errors" -- the errors are for inspections.
error_by_reg_and_fold %>%
  filter(str_detect(Regression, "LOGO")) %>%
  ggplot() +
  geom_sf(aes(fill = MAE)) +
  facet_wrap(~Regression) +
  scale_fill_viridis() +
  labs(title = "Inspection errors by LOGO-CV Regression") +
  mapTheme() + theme(legend.position="bottom")
## neighborhood weights
## NOTE(review): with magrittr, `x %>% poly2nb(as_Spatial(.), ...)` still
## inserts x as the FIRST argument, i.e. poly2nb(x, as_Spatial(x), queen=TRUE),
## and the preceding group_by() has no effect on poly2nb -- verify this builds
## the intended weights. The same weights (built from the "Spatial Process"
## subset only) are also applied to BOTH regressions below -- confirm.
neighborhood.weights <-
filter(error_by_reg_and_fold,
Regression == "Spatial LOGO-CV: Spatial Process") %>%
group_by(cvID) %>%
poly2nb(as_Spatial(.), queen=TRUE) %>%
nb2listw(., style="W", zero.policy=TRUE)
# Permutation test (999 simulations) for spatial autocorrelation of the
# absolute fold errors, per LOGO model.
filter(error_by_reg_and_fold, str_detect(Regression, "LOGO")) %>%
st_drop_geometry() %>%
group_by(Regression) %>%
summarize(Morans_I =
moran.mc(abs(Mean_Error), neighborhood.weights,
nsim = 999, zero.policy = TRUE,
na.action=na.omit)[[1]],
p_value =
moran.mc(abs(Mean_Error), neighborhood.weights,
nsim = 999, zero.policy = TRUE,
na.action=na.omit)[[3]])
## # A tibble: 2 x 3
## Regression Morans_I p_value
## * <chr> <dbl> <dbl>
## 1 Spatial LOGO-CV: Just Risk Factors 0.373 0.001
## 2 Spatial LOGO-CV: Spatial Process 0.339 0.001
Next we’ll use kernel density with varying search radii: 1000, 1500 and 2000 feet. Map below shows the visualization by 3 different search radii.
# demo of kernel width: kernel density of inspection points at three
# bandwidths (1000/1500/2000), masked to the neighborhoods and faceted.
# NOTE(review): {spatstat.core} has been superseded by {spatstat.explore} in
# current spatstat releases -- confirm which version the project pins.
inspect_ppp <- as.ppp(st_coordinates(FoodInspect), W = st_bbox(final_net))
inspect_KD.1000 <- spatstat.core::density.ppp(inspect_ppp, 1000)
inspect_KD.1500 <- spatstat.core::density.ppp(inspect_ppp, 1500)
inspect_KD.2000 <- spatstat.core::density.ppp(inspect_ppp, 2000)
# Rasterize each surface, clip to the neighborhood polygons, and stack into
# one long data frame keyed by bandwidth.
inspect_KD.df <- rbind(
mutate(data.frame(rasterToPoints(mask(raster(inspect_KD.1000), as(neighborhoods, 'Spatial')))), Legend = "1000 Ft."),
mutate(data.frame(rasterToPoints(mask(raster(inspect_KD.1500), as(neighborhoods, 'Spatial')))), Legend = "1500 Ft."),
mutate(data.frame(rasterToPoints(mask(raster(inspect_KD.2000), as(neighborhoods, 'Spatial')))), Legend = "2000 Ft."))
inspect_KD.df$Legend <- factor(inspect_KD.df$Legend, levels = c("1000 Ft.", "1500 Ft.", "2000 Ft."))
ggplot(data=inspect_KD.df, aes(x=x, y=y)) +
geom_raster(aes(fill=layer)) +
facet_wrap(~Legend) +
coord_sf(crs=st_crs(final_net)) +
scale_fill_viridis(name="Density") +
labs(title = "Kernel density with 3 different search radii") +
mapTheme(title_size = 14)
Kernel density can be viewed on the fishnet, with 1,500 randomly sampled inspection observations overlaid.
# Aggregate the 1000-unit kernel density onto the fishnet and overlay a random
# sample of 1,500 inspection points.
# Fixed: the title said "2017 thefts"; the points are the 2018 food
# inspections loaded above.
as.data.frame(inspect_KD.1000) %>%
  st_as_sf(coords = c("x", "y"), crs = st_crs(final_net)) %>%
  aggregate(., final_net, mean) %>%
  ggplot() +
  geom_sf(aes(fill=value)) +
  geom_sf(data = sample_n(FoodInspect, 1500), size = .5) +
  scale_fill_viridis(name = "Density") +
  labs(title = "Kernel density of 2018 inspections") +
  mapTheme(title_size = 14)
To evaluate the generalizability of our model we’ll apply our predictions and compare the results to 2019 inspection statistics.
# Hold-out test set: 2019 food inspections used to evaluate the model trained
# on 2018 data.
FoodInspect19 <-
read.socrata("https://data.cityofchicago.org/Health-Human-Services/Food-Inspections/4ijn-s7e5") %>%
mutate(year = substr(inspection_date,1,4)) %>% filter(year == "2019") %>%
# filter(results=="Fail") %>%
dplyr::select(Y = latitude, X = longitude) %>%
na.omit() %>%
st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
st_transform(st_crs(fishnet)) %>%
mutate(Legend = "Food_Inspect")
ACS data allows us to evaluate generalizability by race contexts. The chunk below compares errors for majority white versus majority non-white neighborhoods. Table below shows that we are generally under-predicting in non-white neighborhoods and over-predicting in majority white neighborhoods. Additionally, the model with spatial process generates smaller errors across neighborhood contexts.
# Kernel-density baseline: bin fishnet cells into risk-percentile categories
# from the KD surface and count 2019 hold-out inspections per cell.
inspect_KDE_sf <- as.data.frame(inspect_KD.1000) %>%
st_as_sf(coords = c("x", "y"), crs = st_crs(final_net)) %>%
aggregate(., final_net, mean) %>%
mutate(label = "Kernel Density",
Risk_Category = ntile(value, 100), # percentile rank of density
Risk_Category = case_when(
Risk_Category >= 90 ~ "90% to 100%",
Risk_Category >= 70 & Risk_Category <= 89 ~ "70% to 89%",
Risk_Category >= 50 & Risk_Category <= 69 ~ "50% to 69%",
Risk_Category >= 30 & Risk_Category <= 49 ~ "30% to 49%",
Risk_Category >= 1 & Risk_Category <= 29 ~ "1% to 29%")) %>%
cbind(
aggregate(
dplyr::select(FoodInspect19) %>% mutate(inspectCount = 1), ., sum) %>%
mutate(inspectCount = replace_na(inspectCount, 0))) %>%
dplyr::select(label, Risk_Category, inspectCount)
# Model-prediction comparison set: the same percentile binning as the KD
# baseline, applied to the spatial LOGO-CV predictions.
inspect_risk_sf <-
reg.ss.spatialCV %>%
mutate(label = "Risk Predictions",
Risk_Category = ntile(Prediction, 100),
Risk_Category = case_when(
Risk_Category >= 90 ~ "90% to 100%",
Risk_Category >= 70 & Risk_Category <= 89 ~ "70% to 89%",
Risk_Category >= 50 & Risk_Category <= 69 ~ "50% to 69%",
Risk_Category >= 30 & Risk_Category <= 49 ~ "30% to 49%",
Risk_Category >= 1 & Risk_Category <= 29 ~ "1% to 29%")) %>%
cbind(
aggregate(
dplyr::select(FoodInspect19) %>% mutate(inspectCount = 1), ., sum) %>%
mutate(inspectCount = replace_na(inspectCount, 0))) %>%
dplyr::select(label,Risk_Category, inspectCount)
# Side-by-side maps of kernel-density risk vs. model risk predictions, with a
# sample of the 2019 hold-out inspections overlaid on both.
# Fixed: the subtitle said "2017 theft risk predictions; 2018 theft"; the
# predictions are trained on 2018 inspections and evaluated on 2019.
rbind(inspect_KDE_sf, inspect_risk_sf) %>%
  na.omit() %>%
  gather(Variable, Value, -label, -Risk_Category, -geometry) %>%
  ggplot() +
  geom_sf(aes(fill = Risk_Category), colour = NA) +
  geom_sf(data = sample_n(FoodInspect19, 3000), size = .5, colour = "black") +
  facet_wrap(~label) +
  scale_fill_viridis(discrete = TRUE) +
  labs(title="Comparison of Kernel Density and Risk Predictions",
       subtitle="2018 inspection risk predictions; 2019 inspections") +
  mapTheme(title_size = 14)
Ultimately, we are able to compare our risk predictions versus the kernel density of 2019 inspections. We are especially interested in improvements in the high risk categories. A well-fit model will show the highest risk categories capturing points of high inspection density. In the chart below we see that risk predictions are slightly higher than kernel density in the highest risk category of 90 to 100% (the rightmost bar). This indicates that our model is able to capture latent risk despite relatively few observed inspections. This dynamic can also be observed in the medium risk category of 50 to 69%.
With increased sophistication in engineered features we would expect our model to improve its ability to capture latent risk, thus increasing its predictive power. An important consideration for a policymaker is the definite presence of racial bias in the risk factors. It would not be prudent to “ignore” racial features in building the model with the hope of avoiding this bias, for the issue is baked into the data even if we exclude race variables. This is due to a number of complicated social factors such as inconsistent reporting and enforcement across neighborhood characteristics. Implicit bias will always be present and it is the job of the data scientist to discern and minimize its impact on the predictions. One proposal made by data scientists at Yale is to include racial features in the training of the model but exclude them when generating predictions (Source: Yale Insights).
# Bar chart: share of 2019 hold-out inspections captured in each risk
# category, kernel density vs. model predictions.
# Fixed: the title said "2018 thefts"; the test set is 2019 food inspections.
rbind(inspect_KDE_sf, inspect_risk_sf) %>%
  st_set_geometry(NULL) %>% na.omit() %>%
  gather(Variable, Value, -label, -Risk_Category) %>%
  group_by(label, Risk_Category) %>%
  summarize(countInspect = sum(Value)) %>%
  ungroup() %>%
  group_by(label) %>%
  mutate(Rate_of_test_set_crimes = countInspect / sum(countInspect)) %>%
  ggplot(aes(Risk_Category,Rate_of_test_set_crimes)) +
  geom_bar(aes(fill=label), position="dodge", stat="identity") +
  scale_fill_viridis(discrete = TRUE) +
  labs(title = "Risk prediction vs. Kernel density, 2019 inspections") +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))